#################################################################
# Start from an empty workspace: remove every object in the current environment.
# NOTE(review): this does not unload packages or reset options(); running the
# script in a fresh R session is the safer way to get a clean state.
rm(list=ls())
#################################################################
# Dependencies
#################################################################
# global
library(dplyr)
library(magrittr)
library(pbmcapply)

# Make the script's own folder the working directory so that the relative
# '../input' and '../output' paths below resolve. The RStudio API can only be
# queried from a running RStudio session, so it is used only when available;
# otherwise the current working directory is kept unchanged.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  # An unsaved document reports an empty path, which setwd() would reject.
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}
getwd()
library(here)
here::here()

# Entity annotations; their doc.id column is matched against the corpus id below.
annot <- readRDS('../output/02-minified-corpus_2019-10-12-CURATED.RDS')
# Text corpus the annotations refer to.
corpus <- readRDS('../input/SMD_Text_Classified_Sentiment_Weekly.RDS')

# Give every corpus row a zero-padded running number as its character id.
# NOTE(review): nchar() on a double measures its character form, so a row count
# that prints in scientific notation (e.g. 1e+05) yields a narrower padding
# width. Left unchanged because the ids presumably have to match doc.id values
# generated elsewhere -- confirm before changing.
id_width <- nchar(as.numeric(nrow(corpus)))
corpus %<>% mutate(id = sprintf(paste0('%0', id_width, 'd'), row_number()))

# Diagnostics: how many annotation doc.ids exist in the corpus, and corpus layout.
table(annot$doc.id %in% corpus$id)
str(corpus)

# Corpus texts without any matching entity annotation:
# keep corpus rows whose id never occurs as annot$doc.id, drop rows with a
# missing `so`, and reduce to the shared column layout.
annot_anti <- corpus %>%
  anti_join(annot, by = c("id" = "doc.id")) %>%
  filter(!is.na(so)) %>%
  select(
    id, so, so_txt, pubDateTime, la, tx,
    annotation_geography, annotation_person, selectsclass, sentiment_value
  ) %>%
  rename(doc.id = id, fullTxt = tx)

# Corpus texts with one or more matching entities:
# attach the corpus columns to every annotation row, drop rows with a missing
# `so` (including annotations without a corpus match), and reduce to the
# columns used downstream.
annot <- annot %>%
  left_join(corpus, by = c("doc.id" = "id")) %>%
  filter(!is.na(so)) %>%
  select(
    doc.id, txt, person.id, so, so_txt, pubDateTime, la, tx,
    annotation_geography, annotation_person, selectsclass, sentiment_value
  ) %>%
  rename(snippet = txt, fullTxt = tx)

# Unfold person.id: each annotation row is repeated once per person id it
# carries, so the result holds exactly one person id per row.
# seq_len() is used instead of 1:nrow() so that an empty `annot` yields no
# iterations rather than the indices c(1, 0).
# NOTE(review): a row whose person.id is empty makes unlist() return NULL, and
# assigning NULL drops the column for that piece -- assumes every row carries
# at least one id; confirm.
unfolded <- pbmclapply(seq_len(nrow(annot)), function(row_idx) {
  current_row <- annot[row_idx, ]
  person_ids <- unlist(current_row$person.id)
  expanded <- current_row[rep(1, length(person_ids)), ]
  expanded$person.id <- person_ids
  expanded
}, mc.cores = 4)
# Stack the per-row pieces into one data frame.
unfolded <- bind_rows(unfolded)

# Make the final DF containing only the SMD articles which have an entity.
# Read the named-entity list; its id is converted to character before it is
# used as the join key against person.id.
entity_list <- read.csv('../output/00-Named_Entity_List_withID.csv', stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  mutate(id = as.character(id))

# Diagnostics on the entity list and the unfolded annotations.
# (The former argument-less table() call was removed: it stops with
# "nothing to tabulate" and aborts a non-interactive run.)
str(entity_list)
table(entity_list$party_short)
str(unfolded)

# One row per (text, person) pair, enriched with the entity attributes.
df <- left_join(unfolded, entity_list, by = c("person.id" = "id"))

# Rows whose `fullname` is missing after the join
# (presumably person ids without an entry in the entity list -- confirm).
filter(df, is.na(fullname))
table(df$party_short)

# Persist the joined data frame under the object name `df`.
# NOTE(review): save() writes the RData format, which is restored with load()
# and not with readRDS(), despite the '.RDS' extension -- confirm what the
# downstream readers expect before switching to saveRDS().
save(list = "df", file='../output/03-annotated-corpus.RDS')

# Latest publication timestamp contained in the final data.
max(df[["pubDateTime"]])


# Add all SMD texts with no entity to the data to obtain a full data set
# (the combination itself is not implemented in the visible part of this script).
# List the column names of both data frames.
colnames(df)
colnames(annot_anti)
